# Brian M. Schilder, Bioinformatician II
# Import shared helper functions (e.g. import_parameters, createDT).
root <- "./"
source(file.path(root,"general_functions.R"))
# Load run parameters (subsetGenes, subsetCells, resolution, resultsPath,
# nCores, perplexity) into the environment; utilized cores = 4.
# The parameter values printed below come from the knitted report.
import_parameters(params)
## [1] "protein_coding"
##
## $subsetCells
## [1] "400"
##
## $resolution
## [1] "0.2"
##
## $resultsPath
## [1] "Results/subsetGenes-protein_coding__subsetCells-400__Resolution-0.2__perplexity-30__nCores-4"
##
## $nCores
## [1] 4
##
## $perplexity
## [1] "30"
## Results/subsetGenes-protein_coding__subsetCells-400__Resolution-0.2__perplexity-30__nCores-4
library(Seurat)
library(dplyr)
library(gridExtra)
library(knitr)
library(plotly)
library(ggplot2)
library(shiny)
library(DT)
# Record the exact R/package versions for reproducibility.
sessionInfo()  ## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DT_0.5.1 shiny_1.2.0 plotly_4.8.0 knitr_1.21 gridExtra_2.3
## [6] dplyr_0.7.8 Seurat_2.3.4 Matrix_1.2-15 cowplot_0.9.4 ggplot2_3.1.0
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.4-0 class_7.3-15
## [4] modeltools_0.2-22 ggridges_0.5.1 mclust_5.4.2
## [7] htmlTable_1.13.1 base64enc_0.1-3 rstudioapi_0.9.0
## [10] proxy_0.4-22 npsurv_0.4-0 flexmix_2.3-14
## [13] bit64_0.9-7 mvtnorm_1.0-8 codetools_0.2-16
## [16] splines_3.5.1 R.methodsS3_1.7.1 lsei_1.2-0
## [19] robustbase_0.93-3 jsonlite_1.6 Formula_1.2-3
## [22] ica_1.0-2 cluster_2.0.7-1 kernlab_0.9-27
## [25] png_0.1-7 R.oo_1.22.0 compiler_3.5.1
## [28] httr_1.4.0 backports_1.1.3 assertthat_0.2.0
## [31] lazyeval_0.2.1 later_0.8.0 lars_1.2
## [34] acepack_1.4.1 htmltools_0.3.6 tools_3.5.1
## [37] bindrcpp_0.2.2 igraph_1.2.2 gtable_0.2.0
## [40] glue_1.3.0 RANN_2.6.1 reshape2_1.4.3
## [43] Rcpp_1.0.0 trimcluster_0.1-2.1 gdata_2.18.0
## [46] ape_5.2 nlme_3.1-137 iterators_1.0.10
## [49] fpc_2.1-11.1 gbRd_0.4-11 lmtest_0.9-36
## [52] xfun_0.4 stringr_1.4.0 mime_0.6
## [55] irlba_2.3.3 gtools_3.8.1 DEoptimR_1.0-8
## [58] MASS_7.3-51.1 zoo_1.8-4 scales_1.0.0
## [61] promises_1.0.1 doSNOW_1.0.16 parallel_3.5.1
## [64] RColorBrewer_1.1-2 yaml_2.2.0 reticulate_1.10
## [67] pbapply_1.4-0 rpart_4.1-13 segmented_0.5-3.0
## [70] latticeExtra_0.6-28 stringi_1.2.4 foreach_1.4.4
## [73] checkmate_1.9.1 caTools_1.17.1.1 bibtex_0.4.2
## [76] Rdpack_0.10-1 SDMTools_1.1-221 rlang_0.3.1
## [79] pkgconfig_2.0.2 dtw_1.20-1 prabclus_2.2-7
## [82] bitops_1.0-6 evaluate_0.12 lattice_0.20-38
## [85] ROCR_1.0-7 purrr_0.3.0 bindr_0.1.1
## [88] htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5
## [91] plyr_1.8.4 magrittr_1.5 R6_2.3.0
## [94] snow_0.4-3 gplots_3.0.1.1 Hmisc_4.2-0
## [97] pillar_1.3.1 foreign_0.8-71 withr_2.1.2
## [100] fitdistrplus_1.0-14 mixtools_1.1.0 survival_2.43-3
## [103] nnet_7.3-12 tsne_0.1-3 tibble_2.0.1
## [106] crayon_1.3.4 hdf5r_1.0.1 KernSmooth_2.23-15
## [109] rmarkdown_1.11 grid_3.5.1 data.table_1.12.0
## [112] metap_1.1 digest_0.6.18 diptest_0.75-7
## [115] xtable_1.8-3 httpuv_1.4.5.1 tidyr_0.8.2
## [118] R.utils_2.7.0 stats4_3.5.1 munsell_0.5.0
## [121] viridisLite_0.3.0
print(paste("Seurat ", packageVersion("Seurat")))  ## [1] "Seurat 2.3.4"
## ! IMPORTANT! Must not setwd to local path when launching on cluster
# setwd("~/Desktop/PD_scRNAseq/")
# Load the pre-built Seurat object (loads 'seurat.obj' into the workspace).
load(file.path(root, "Data/seurat_object_add_HTO_ids.Rdata"))
DAT <- seurat.obj
# Bug fix: 'rm(seurat.obj)DAT' was fused onto one line (a parse error);
# the statements are now separated.
rm(seurat.obj)
DAT
## An object of class seurat in project RAJ_13357
## 24914 genes across 22113 samples.
# Per-cell metadata (donor, diagnosis, mutation status, age, etc.).
metadata <- read.table(file.path(root, "Data/meta.data4.tsv"))
createDT(metadata, caption = "Metadata")
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Make AgeGroups
# Bin ages into 'unit'-wide groups (e.g. "60-70") and append them as an
# 'AgeGroups' factor column. An 'rn' column preserving the original rownames
# is added first, mirroring the previous data.table keep.rownames behavior.
#
# Bug fix: the original passed the misspelled argument 'nclude.lowest' to
# cut(), which silently discarded it via '...'; it is now spelled
# 'include.lowest'. The data.table dependency was replaced with equivalent
# base R, and the append()-in-a-loop label construction was vectorized.
#
# @param meta data.frame with a numeric 'Age' column; defaults to the global
#   'metadata' so the original no-argument call keeps working.
# @param unit width of each age bin in years (default 10).
# @return data.frame: 'rn', the original columns, then 'AgeGroups'.
makeAgeGroups <- function(meta = metadata, unit = 10) {
  maxRound <- unit * ceiling(max(meta$Age) / unit)
  minRound <- unit * floor(min(meta$Age) / unit)
  # The extra final break (maxRound + unit) guarantees the maximum age falls
  # inside a right-open interval when right = FALSE.
  ageBreaks <- c(seq(minRound, maxRound, by = unit), maxRound + unit)
  ageLabels <- paste(ageBreaks[-length(ageBreaks)], ageBreaks[-1], sep = "-")
  out <- data.frame(rn = rownames(meta), meta, row.names = NULL,
                    check.names = FALSE, stringsAsFactors = FALSE)
  out$AgeGroups <- cut(out$Age, breaks = ageBreaks, right = FALSE,
                       labels = ageLabels, include.lowest = TRUE)
  out
}
# metadata <- makeAgeGroups()
DAT <- AddMetaData(object = DAT, metadata = metadata)
# Get rid of any NAs (cells that don't match up with the metadata).
# Bug fix: 'if (subsetCells == F)' threw "missing value where TRUE/FALSE
# needed" (see the logged error), and params deliver subsetCells as a string
# (e.g. "400"). Coerce first and treat anything non-numeric as "no subset".
nCellsSubset <- suppressWarnings(as.integer(subsetCells))
if (isFALSE(subsetCells) || is.na(nCellsSubset)) {
  DAT <- FilterCells(object = DAT, subset.names = "nGene", low.thresholds = 0)
} else {
  # Subset for testing; seq_len() replaces the original 0:subsetCells, whose
  # zero index was silently dropped anyway.
  DAT <- FilterCells(object = DAT, subset.names = "nGene", low.thresholds = 0,
                     cells.use = DAT@cell.names[seq_len(nCellsSubset)])
}
# Include only subsets of genes by type. Biotypes from: https://useast.ensembl.org/info/genome/genebuild/biotypes.html
# Subset a Seurat (v2) object to genes of a single Ensembl biotype.
#
# Gene biotypes are read from a cached CSV when present; otherwise they are
# fetched from biomaRt (GRCh37) and cached. Because Seurat v2 cannot subset
# genes in place, a new Seurat object is built from the reduced raw matrix
# and the original metadata is re-attached.
#
# Bug fix: the original 'if (subsetGenes != F)' threw "missing value where
# TRUE/FALSE needed" when subsetGenes was NA (see the logged error); the
# guard now handles NULL/NA/FALSE explicitly. Also removed: two discarded
# listFilters()/listAttributes() calls, a redundant useDataset() (useMart()
# already sets the dataset), and the unused 'orig.ident' variable. biomaRt
# calls are namespace-qualified since the package is never attached here.
#
# @param DAT Seurat object.
# @param subsetGenes biotype string (e.g. "protein_coding"), or FALSE/NA/NULL
#   to skip subsetting.
# @return Seurat object restricted to the requested biotype (or unchanged).
subsetBiotypes <- function(DAT, subsetGenes){
  if (is.null(subsetGenes) || is.na(subsetGenes) || isFALSE(subsetGenes)) {
    return(DAT)
  }
  cat(paste("Subsetting genes:", subsetGenes, "\n"))
  # If the gene_biotypes file exists, import csv. Otherwise, query biomaRt.
  biotypesFile <- file.path(root, "Data/gene_biotypes.csv")
  if (file_test("-f", biotypesFile)) {
    biotypes <- read.csv(biotypesFile)
  } else {
    ensembl <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
                                host = "grch37.ensembl.org",
                                dataset = "hsapiens_gene_ensembl")
    biotypes <- biomaRt::getBM(attributes = c("hgnc_symbol", "gene_biotype"),
                               filters = "hgnc_symbol",
                               values = row.names(DAT@data), mart = ensembl)
    # Cache for future runs.
    write.csv(biotypes, biotypesFile, quote = FALSE, row.names = FALSE)
  }
  geneSubset <- biotypes[biotypes$gene_biotype == subsetGenes, "hgnc_symbol"]
  cat(paste(dim(DAT@raw.data[geneSubset, ])[1], "/", dim(DAT@raw.data)[1],
            "genes are", subsetGenes))
  # Subset data by creating a new Seurat object (annoying but necessary in v2).
  subset.matrix <- DAT@raw.data[geneSubset, ]
  DAT_sub <- CreateSeuratObject(subset.matrix)
  DAT_sub <- AddMetaData(object = DAT_sub, metadata = DAT@meta.data)
  DAT_sub <- SetAllIdent(object = DAT_sub, id = "ident")
  DAT_sub
}
DAT <- subsetBiotypes(DAT, subsetGenes)
# Filter by cells, normalize, filter by gene variability.
cat("Total Cells:", length(DAT@cell.names), "\n")  ## Total Cells: 22113
# Standard QC gates: 200 <= nGene <= 2500 and percent.mito <= 5%.
DAT <- FilterCells(object = DAT, subset.names = c("nGene", "percent.mito"),
                   low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
cat("Filtered Cells:", length(DAT@cell.names))  ## Filtered Cells: 19144
# Bug fix: raw prose was fused onto the NormalizeData() line and the lines
# below it (parse errors); it is preserved here as comments.
DAT <- NormalizeData(object = DAT, normalization.method = "LogNormalize",
                     scale.factor = 10000)
# Important!: In ScaleData, specify do.par = F (unless parallel processing is
# set up properly, this will cause the script to crash) and num.cores = nCores
# (all available cores, determined by parallel::detectCores()).
# Regress out: number of unique transcripts (nUMI) and % mitochondrial
# transcripts (percent.mito).
# Store the top most variable genes in @var.genes.
DAT <- FindVariableGenes(object = DAT, mean.function = ExpMean,
                         dispersion.function = LogVMR,
                         x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
cat("Total Genes:", length(row.names(DAT@raw.data)), "\n")  ## Total Genes: 24914
# IMPORTANT!: Must set do.par=T and num.cores = n for large datasets being
# processed on computing clusters.
# IMPORTANT!: Use only the var.genes identified by 'FindVariableGenes' as the
# 'genes.use' arg in 'ScaleData'. This will greatly reduce the computational load.
cat("Highly Variable Genes:", length(DAT@var.genes))  ## Highly Variable Genes: 2965
# Ensure the CD14 and CD16 (FCGR3A) marker genes are always scaled.
appendedGenes <- c(DAT@var.genes, "CD14", "FCGR3A")
DAT <- ScaleData(object = DAT, genes.use = appendedGenes,
                 vars.to.regress = c("nUMI", "percent.mito"),
                 do.par = T, num.cores = params$nCores)
## Regressing out: nUMI, percent.mito
## Time Elapsed: 16.4969220161438 secs
## Scaling data matrix
DAT
## An object of class seurat in project RAJ_13357
## 24914 genes across 19144 samples.
# QC violin plots. The original used the obscure '%>% +' magrittr trick to add
# the alpha aesthetic; a plain '+' does the same thing transparently.
vp <- VlnPlot(object = DAT, features.plot = c("nGene", "nUMI", "percent.mito"),
              nCol = 3, do.return = T) + ggplot2::aes(alpha = 0.5)
vp
# par(mfrow = c(1, 2))
# do.hover <- ifelse(interactive == T, T, F)
GenePlot(object = DAT, gene1 = "nUMI", gene2 = "percent.mito", pch.use = 20)
GenePlot(object = DAT, gene1 = "nUMI", gene2 = "nGene", pch.use = 20)
# ProjectPCA scores each gene in the dataset (including genes not included in
# the PCA) based on their correlation with the calculated components. Though
# not used further here, it can identify markers strongly correlated with
# cellular heterogeneity that did not pass variable gene selection; explore
# the projected PCA by setting use.full=T in the functions above.
# Run PCA with only the top most variable genes.
DAT <- RunPCA(object = DAT, pc.genes = DAT@var.genes, do.print = F, verbose = F)
# Store the projection in the Seurat object so it isn't recalculated for the
# tSNE/UMAP steps. Bug fix: 'ProjectPCA(...) VizPCA(...)' was fused onto one
# line (a parse error); the statements are now separated.
DAT <- ProjectPCA(object = DAT, do.print = F)
VizPCA(object = DAT, pcs.use = 1:2)
# 'PCHeatmap' is a wrapper for heatmap.2.
PCHeatmap(object = DAT, pc.use = 1:12, do.balanced = T, label.columns = F, use.full = F)
# Determine statistically significant PCs. NOTE: JackStraw can take a long
# time for big datasets; PCElbowPlot() is a cheaper approximation.
# DAT <- JackStraw(object = DAT, num.replicate = 100, display.progress = FALSE)
PCElbowPlot(object = DAT)
# FindClusters algorithm for modularity optimization: 1 = original Louvain;
# 2 = Louvain with multilevel refinement; 3 = SLM. The 'resolution' parameter
# sets clustering granularity (0.6-1.2 is typical for ~3K cells; optimal
# resolution often increases for larger datasets). Clusters land in @ident.
#
# Bug fix: import_parameters() delivers 'perplexity' and 'resolution' as
# strings, which produced the logged errors "perplexity should be a positive
# number" and "Not compatible with requested type: character"; both are now
# coerced to numeric at the call sites.
DAT <- RunTSNE(object = DAT, reduction.use = "pca", dims.use = 1:10, do.fast = TRUE,
               perplexity = as.numeric(perplexity), tsne.method = "Rtsne",
               num_threads = nCores, verbose = F)  # FItSNE also supported
# TRY DIFFERENT RESOLUTIONS
DAT <- StashIdent(object = DAT, save.name = "pre_clustering")
# DAT <- SetAllIdent(object = DAT, id = "pre_clustering")
DAT <- FindClusters(object = DAT, reduction.type = "pca", dims.use = 1:10,
                    algorithm = 1, resolution = as.numeric(resolution),
                    print.output = F, save.SNN = T,
                    n.start = 10, nn.eps = 0.5, plot.SNN = T, force.recalc = T)
PrintFindClustersParams(object = DAT)
DAT <- StashIdent(object = DAT, save.name = "post_clustering")
# do.hover <- ifelse(interactive == T, T, F)
PCAPlot(object = DAT, dim.1 = 1, dim.2 = 2, group.by = "post_clustering")
# Additional UMAP arguments detailed here:
# https://umap-learn.readthedocs.io/en/latest/api.html#module-umap.umap_
# cat(print(mem_used()))
DAT <- RunUMAP(object = DAT, reduction.use = "pca", dims.use = 1:10,
               verbose = TRUE, num_threads = nCores)
# cat(print(mem_used()))
# Plot results.
DimPlot(object = DAT, reduction.use = 'umap')
# As input to the tSNE, use the same PCs as input to the clustering analysis
# (computing the tSNE on scaled gene expression is also supported via the
# genes.use argument).
# Important!: Specify num_threads=0 in 'RunTSNE' to use all available cores.
# "FItSNE", a fast t-SNE implementation, is also available through RunTSNE,
# but must first be set up on your computer.
labSize <- 12
# Build a transparent categorical color palette for a metadata variable.
#
# Bug fix: the original used 'DAT@meta.data[var]', which is a one-column
# data.frame, so length(unique(...)) was always 1 (one color regardless of
# how many clusters exist). '[[var]]' extracts the vector so unique() counts
# the actual groups. The metadata is now a parameter whose default preserves
# the original reliance on the global 'DAT'.
#
# @param var metadata column to color by.
# @param palette RColorBrewer palette name.
# @param meta metadata data.frame (defaults to DAT@meta.data).
# @return character vector of 50%-alpha hex colors.
customColors <- function(var = "post_clustering", palette = "Set1",
                         meta = DAT@meta.data) {
  add.alpha <- function(col, alpha = 1) {
    if (missing(col))
      stop("Please provide a vector of colours.")
    apply(sapply(col, col2rgb) / 255, 2,
          function(x) rgb(x[1], x[2], x[3], alpha = alpha))
  }
  # brewer.pal() requires n >= 3, so clamp small group counts.
  n_groups <- max(3L, length(unique(meta[[var]])))
  cluster_colors <- RColorBrewer::brewer.pal(n_groups, palette)
  as.character(add.alpha(cluster_colors, 0.5))
}
# Try t-SNE at different perplexities.
# Bug fix: 'c(perplexity, 5, 20, 30, 100)' coerced every value to character
# when the 'perplexity' param was a string, making RunTSNE fail with
# "perplexity should be a positive number" (see logged error). Values are now
# coerced to numeric and de-duplicated so perplexity 30 is not run twice.
# Also removed the invalid 'aesthetics = aes(alpha=.5)' argument:
# scale_color_brewer()'s 'aesthetics' expects a character vector of aesthetic
# names, not an aes() object.
for (i in unique(c(as.numeric(perplexity), 5, 20, 30, 100))) {
  cat('\n')
  cat("### t-SNE: perplexity =", i, "\n")
  DAT <- RunTSNE(object = DAT, reduction.use = "pca", dims.use = 1:10,
                 do.fast = TRUE, perplexity = i, tsne.method = "Rtsne",
                 num_threads = nCores, verbose = F)  # FItSNE
  tsnePlot <- TSNEPlot(object = DAT, do.label = T, label.size = labSize,
                       do.return = T) +
    scale_color_brewer(palette = "Set1")
  print(tsnePlot)
  cat('\n')
}
# Plot a t-SNE colored by a metadata variable side-by-side with the
# unsupervised clusters. Relies on globals: 'DAT' (Seurat object) and
# cowplot's plot_grid (attached via Seurat's cowplot dependency).
#
# Bug fix: the 'aesthetics' argument of scale_color_brewer() expects a
# character vector of aesthetic names; passing 'aes(alpha=.5)' was invalid
# and has been removed from both scales.
#
# @param var metadata column name to color the left panel by.
# @param labSize cluster label text size.
tSNE_metadata_plot <- function(var, labSize = 12) {
  # Metadata plot.
  p1 <- TSNEPlot(DAT, do.return = T, do.label = T, group.by = var,
                 label.size = labSize, plot.title = paste(var),
                 vector.friendly = T) +
    theme(legend.position = "top") +
    scale_color_brewer(palette = "Dark2")
  # t-SNE unsupervised-clusters plot.
  p2 <- TSNEPlot(DAT, do.return = T, do.label = T, label.size = labSize,
                 plot.title = paste("Unsupervised Clusters"),
                 vector.friendly = T) +
    theme(legend.position = "top") +
    scale_color_brewer(palette = "Set1")
  print(plot_grid(p1, p2))
}
# Iterate plots over metadata variables. The loop variable is renamed from
# 'var' (which masked stats::var) to 'metaVar'.
metaVars <- c("dx", "mut", "Gender", "Age", "ID")
for (metaVar in metaVars) {
  cat('\n')
  cat("### t-SNE Metadata plot for ", metaVar, "\n")
  tSNE_metadata_plot(metaVar)
  cat('\n')
}
# Save the full workspace so downstream notebooks can resume from here.
# (file.path() with a single element was a no-op and has been dropped.)
save.image("scRNAseq_results.RData")